#
# Organize mobility regressions by wealth and income
# 

#==============================================
# Preamble
#==============================================


START_TIME = Sys.time()

scripts.dir <- dirname(sys.frame(1)$ofile)
print(scripts.dir)
setwd(scripts.dir)

setwd('..')
in.dir = paste(getwd(), 'data', sep='/')
out.dir = paste(getwd(), 'output', sep='/')
log.dir = paste(getwd(), 'log', sep='/')

library('reshape')
library('dplyr')
library('stringr')

options(stringsAsFactors=FALSE)

#==============================================
# Read in estimates
#==============================================

print(in.dir)
setwd(in.dir)

files.0 = data.frame(f.n=list.files())
files.0$one = 1

files.1 = files.0[grepl('^track_mob_6_[a-z0-9_]{1,20}.csv', files.0$f.n), ]

data.10 = NULL
stats.10 = NULL

for(i in 1:nrow(files.1)){
  print(in.dir)
  setwd(in.dir)
  fn = files.1$f.n[i]
  data.0 = as.matrix(read.csv(fn))
  data.0[,] = gsub('^=', '', data.0[,])
  
  consrow = (1:nrow(data.0))[data.0[, 1] == '_cons'][1]
  data.1 = as.data.frame(t(data.0[2:consrow, 2:ncol(data.0)]))
  
  colnames(data.1) = data.0[2:consrow, 1]
  data.1$spec = c(data.0[1, 2:ncol(data.0)])
  
  
  for(j in 1:nrow(data.1)){
    if(nchar(gsub('\\s+', '', data.1$spec[j])) == 0){
      data.1$spec[j] = data.1$spec[j-1]
    }
  }
  
  data.1$stat = ifelse((1:nrow(data.1) %% 2) == 1, 'b', 'se')
  data.2 = as.data.frame(melt(data.1, id.vars=c('spec', 'stat')))
  names(data.2) = c('spec', 'stat', 'x', 'cell.str')
  data.2$x = as.character(data.2$x)
  
  reg.0 = '([a-z0-9_-]{1,30})::([a-z0-9_-]{1,20})::([a-z0-9_-]{1,20})(::([a-z0-9_-]{1,10})::([a-z0-9_-]{1,10}))?'
  a = str_match(data.2$spec, reg.0)
  data.2$y = a[, 2]
  data.2$target = a[, 3]
  data.2$fe = a[, 4]
  data.2$flex = ifelse(is.na(a[, 6]), 'none', a[, 6])
  data.2$samp = ifelse(is.na(a[, 7]), 'none', a[, 7])
  
  data.3 = data.2
  
  
  stats.0 = as.data.frame(t(data.0[(consrow+1):(nrow(data.0)-1), 2:ncol(data.0)]))
  colnames(stats.0) = data.0[(consrow+1):(nrow(data.0)-1), 1]
  
  stats.0$spec = data.1$spec
  stats.1 = stats.0[(1:nrow(data.1) %% 2) == 1, ]
  a = str_match(stats.1$spec, reg.0)
  stats.1$y = a[, 2]
  stats.1$target = a[, 3]
  stats.1$fe = a[, 4]
  stats.1$flex = ifelse(is.na(a[, 6]), 'none', a[, 6])
  stats.1$samp = ifelse(is.na(a[, 7]), 'none', a[, 7])
  
  stats.3 = stats.1
  
  stats.10 = rbind(stats.10, stats.3)
  data.10 = rbind(data.10, data.3)
}

data.10$tag = with(data.10, paste('tag', y, target, fe, flex, samp, spec, x, stat, 'end', sep='.'))
look.0 = data.10[order(data.10$tag), ]
look.1 = look.0[!duplicated(look.0$tag), ]

data.11 = as.data.frame(cast(look.1, y + target + fe + flex + samp + spec + x ~ stat,
                             value='cell.str'))
data.12 = data.11[data.11$x != 'N', ]
data.12$se.0 = with(data.12, as.numeric(gsub('[\\(\\)]', '', se)))
data.12$b.0 = with(data.12, as.numeric(b))

probs.0 = c(0.1, 0.05, 0.01)
probs.1 = abs(qnorm(probs.0 / 2))

#
# pretty coefficient strings
#
data.12$t.0 = with(data.12, abs(b.0 / se.0))
data.12$se.1 = with(data.12, formatC(as.numeric(round(se.0 * 1000) / 1000),
                                     format='f', flag='0', digits=3))
data.12$se.2 = with(data.12, ifelse(is.na(se.0), '',
                                    paste('="(', se.1, ')"', sep='')))
data.12$stars = as.character(cut(data.12$t.0, breaks=c(-Inf, probs.1, Inf),
                               labels=c('', '*', '**', '***')))
data.12$b.1 = with(data.12, formatC(as.numeric(round(b.0 * 1000) / 1000),
                                    format='f', flag='0', digits=3))
data.12$b.2 = with(data.12, ifelse(is.na(se.0), '',
                                   paste('="', b.1, stars, '"', sep='')))

data.13 = data.12[, 1:7]
data.13$b = data.12$b.2
data.13$se = data.12$se.2

stats.10$tag = with(stats.10, paste('tag', y, samp, target, fe, flex, 'end', sep='.'))
rownames(stats.10) = stats.10$tag

tmp1 = with(data.12, paste('tag', y, samp, target, fe, flex, 'end', sep='.'))

data.13$ymean = stats.10[tmp1, 'mn_out']


id.vars.0 = c('y', 'target', 'fe', 'flex', 'samp', 'spec', 'x')
measure.vars.0 = c('b', 'se', 'ymean')
data.14 = as.data.frame(melt(data.13, id.vars=id.vars.0,
                             measure.vars=measure.vars.0))
names(data.14) = c(names(data.14)[1:7], 'stat', 'cell.str')
data.14$pred_pct = gsub('_[a-zA-Z_]{1,20}[0-9]{1,10}$', '', data.14$y)
data.14$y_var = gsub('^mob_[0-9]{2}_', '', data.14$y)
data.14 = data.14 %>%
  mutate(col.prio = case_when(
    fe == 'y' ~ 1,
    fe == 'yd' ~ 2,
    fe == 'none' ~ 3,
    fe == 'flex' ~ 4,
    TRUE ~ 5
  ))


reg.0 = 'trk_((unadj)|(rel))_((cy)|(dy))_([dba]m)'
a = str_match(data.14$target, reg.0)
data.14$meas = a[, 2]
data.14$instr.lvl = a[, 5]
data.14$instr.samp = a[, 8]

#==============================================
# Results tables
#==============================================

stage2.00 = data.14[with(data.14, grepl('trk_((elem)|(mid))_x$', x) &
                           (fe == 'yc') &
                           (instr.samp != 'bm') &
                           (!((instr.lvl == 'cy') & (instr.samp == 'am'))) &
                           (!((instr.lvl == 'dy') & (instr.samp == 'dm'))) &
                           (!(((x == 'trk_elem_x') |
                                 (instr.lvl == 'dy')) &
                                (stat == 'ymean')))), ]

y.order.0 = c('scoreresid1'=1, 'scoreresid2'=2, 'scoreresid3'=3, 'scoreresid4'=4)

stage2.0 = stage2.00[grepl('scoreresid[0-9]', stage2.00$y) == T, ]

stage2.0$out.name = substr(stage2.0$y, 8, 30)
stage2.0$pct.in = as.numeric(substr(stage2.0$y, 5, 6))
x.order.0 = c('trk_elem_x'=1, 'trk_mid_x'=2, 'trk_elem_x2'=3, 'trk_mid_x2'=4)
x.order.1 = c('b'=1, 'se'=2, 'ymean'=3)
stage2.0$row.order = (10000 * y.order.0[stage2.0$out.name]) +
  (100 * x.order.0[stage2.0$x]) + x.order.1[stage2.0$stat]

# 
# order the columns properly
# 
spec.order.0 = c('unadj'=1, 'rel'=2)
spec.order.1 = c('dm'=1, 'am'=2, 'bm'=3)
spec.order.2 = c('cy'=1, 'dy'=2)
spec.order.3 = c('samp0'=1, 'samp1'=2, 'samp2'=3, 'samp3'=4)
spec.order.4 = c('mob_25'=1, 'mob_75'=2, 'mob_50'=3)

stage2.0$col.order.num =
  (1000000 * spec.order.4[stage2.0$pred_pct]) +
  (100000000 * spec.order.0[stage2.0$meas]) +
  (10000 * stage2.0$pct.in) +
  (100 * spec.order.1[stage2.0$instr.samp]) +
  (1 * spec.order.2[stage2.0$instr.lvl]) +
  (1000000000 * spec.order.3[stage2.0$samp])
stage2.0$col.order = paste('c', stage2.0$col.order.num, sep='')
stage2.1 = as.data.frame(cast(stage2.0, flex + row.order + out.name + x + stat ~
                                col.order + samp + meas + pct.in +
                                instr.samp + instr.lvl + fe,
                              value='cell.str'))
for(k in 1:7){
  stage2.1[, paste('gap', k, sep='.')] = NA
}

# include appropriate gaps between rows
a0 = c(1000, 1, 2, 1000, 3, 4, 1000, 5, 1000)
a1 = c()
for(i in 0:20){
  a1 = c(a1, a0 + (5 * i))
}
stage2.2 = stage2.1[a1, ]

print(out.dir)
setwd(out.dir)

write.csv(stage2.2, 'track_mob_7.csv', na='', row.names=F)

#==============================================
# Postamble
#==============================================

END_TIME = Sys.time()
print("Time elapsed in steps reorg:")
print(END_TIME - START_TIME)



